# Point the session at the project folder so that the relative paths used
# later in this script (e.g. "DataCleanAndFormatted") resolve inside it.
# NOTE(review): this absolute path is machine-specific; running from an
# RStudio project (or otherwise avoiding setwd) would make the script portable.
setwd("C:/Users/CarlinML/DACSS-690R/Second_Deliverable")
getwd()
## [1] "C:/Users/CarlinML/DACSS-690R/Second_Deliverable"
FIRST DATASET - CLEANING
Remove leading and trailing spaces
# Strip leading/trailing whitespace from every column. trimws() returns
# character, so every column is a character vector after this step.
# lapply() keeps the result a list of columns regardless of row count
# (sapply() would collapse it to a matrix or a bare vector).
SMMH_dirty[] <- lapply(SMMH_dirty, trimws)
SMMH_clean <- SMMH_dirty
View(SMMH_clean)
str(SMMH_clean)
## 'data.frame': 481 obs. of 21 variables:
## $ Timestamp : chr "4/18/2022 19:18:47" "4/18/2022 19:19:28" "4/18/2022 19:25:59" "4/18/2022 19:29:43" ...
## $ 1. What is your age? : chr "21" "21" "21" "21" ...
## $ 2. Gender : chr "Male" "Female" "Female" "Female" ...
## $ 3. Relationship Status : chr "In a relationship" "Single" "Single" "Single" ...
## $ 4. Occupation Status : chr "University Student" "University Student" "University Student" "University Student" ...
## $ 5. What type of organizations are you affiliated with? : chr "University" "University" "University" "University" ...
## $ 6. Do you use social media? : chr "Yes" "Yes" "Yes" "Yes" ...
## $ 7. What social media platforms do you commonly use? : chr "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Instagram, YouTube, Pinterest" "Facebook, Instagram" ...
## $ 8. What is the average time you spend on social media every day? : chr "Between 2 and 3 hours" "More than 5 hours" "Between 3 and 4 hours" "More than 5 hours" ...
## $ 9. How often do you find yourself using Social media without a specific purpose? : chr "5" "4" "3" "4" ...
## $ 10. How often do you get distracted by Social media when you are busy doing something? : chr "3" "3" "2" "2" ...
## $ 11. Do you feel restless if you haven't used Social media in a while? : chr "2" "2" "1" "1" ...
## $ 12. On a scale of 1 to 5, how easily distracted are you? : chr "5" "4" "2" "3" ...
## $ 13. On a scale of 1 to 5, how much are you bothered by worries? : chr "2" "5" "5" "5" ...
## $ 14. Do you find it difficult to concentrate on things? : chr "5" "4" "4" "3" ...
## $ 15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?: chr "2" "5" "3" "5" ...
## $ 16. Following the previous question, how do you feel about these comparisons, generally speaking? : chr "3" "1" "3" "1" ...
## $ 17. How often do you look to seek validation from features of social media? : chr "2" "1" "1" "2" ...
## $ 18. How often do you feel depressed or down? : chr "5" "5" "4" "4" ...
## $ 19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate? : chr "4" "4" "2" "3" ...
## $ 20. On a scale of 1 to 5, how often do you face issues regarding sleep? : chr "5" "5" "5" "2" ...
Fix column names (variable names were too cumbersome when I tried
replacing spaces with underscores, so I decided to use the following
code to rename the variables instead)
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Lookup of short variable name -> original survey question text.
# all_of() makes rename() fail loudly if any question column is missing.
question_lookup <- c(
  Age = "1. What is your age?",
  Gender = "2. Gender",
  RelStatus = "3. Relationship Status",
  OccStatus = "4. Occupation Status",
  OrgAffil = "5. What type of organizations are you affiliated with?",
  UseSocialMedia = "6. Do you use social media?",
  Platforms = "7. What social media platforms do you commonly use?",
  AmtTime = "8. What is the average time you spend on social media every day?",
  WoutPurpose = "9. How often do you find yourself using Social media without a specific purpose?",
  Distracted = "10. How often do you get distracted by Social media when you are busy doing something?",
  Restless = "11. Do you feel restless if you haven't used Social media in a while?",
  EasilyDist = "12. On a scale of 1 to 5, how easily distracted are you?",
  Bothered = "13. On a scale of 1 to 5, how much are you bothered by worries?",
  DiffConcen = "14. Do you find it difficult to concentrate on things?",
  Compare = "15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?",
  Comparisons = "16. Following the previous question, how do you feel about these comparisons, generally speaking?",
  Validation = "17. How often do you look to seek validation from features of social media?",
  Depressed = "18. How often do you feel depressed or down?",
  Interest = "19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?",
  Sleep = "20. On a scale of 1 to 5, how often do you face issues regarding sleep?"
)
SMMH_clean <- rename(SMMH_clean, all_of(question_lookup))
Run frequency tables of categorical variables to see if any recoding
is needed
table(SMMH_clean$Gender)
##
## Female Male NB Non-binary
## 263 211 1 1
## Non binary Nonbinary There are others??? Trans
## 1 1 1 1
## unsure
## 1
# Collapse the free-text spellings of non-binary into a single category.
# %in% is used instead of == so the index never contains NA.
nonbinary_variants <- c("NB", "Non binary", "Nonbinary")
SMMH_clean$Gender[SMMH_clean$Gender %in% nonbinary_variants] <- "Non-binary"

# Responses that do not name a gender become true missing values
# (NA_character_) rather than the literal string "NA", so is.na(), na.rm
# and factor() treat them as missing.
unusable_responses <- c("unsure", "There are others???")
SMMH_clean$Gender[SMMH_clean$Gender %in% unusable_responses] <- NA_character_

# useNA = "ifany" keeps the count of missing responses visible in the table.
table(SMMH_clean$Gender, useNA = "ifany")
##
## Female Male NA Non-binary Trans
## 263 211 2 4 1
Additional freq tables and misspelling correction
table(SMMH_clean$RelStatus) # no cleaning needed
##
## Divorced In a relationship Married Single
## 7 88 101 285
table(SMMH_clean$OccStatus) # no cleaning needed
##
## Retired Salaried Worker School Student University Student
## 8 132 49 292
table(SMMH_clean$OrgAffil) # clean up N/As
##
## Company Company, Private
## 30 7
## Goverment N/A
## 6 30
## Private School
## 60 44
## School, Company School, N/A
## 2 2
## School, Private School, University
## 1 9
## School, University, Private University
## 4 239
## University, Company University, Company, Goverment
## 19 1
## University, Company, Private University, Goverment
## 5 2
## University, Goverment, Private University, N/A
## 1 3
## University, Private
## 16
# 1) Drop a trailing ", N/A" from multi-answer responses.
# 2) Fix the "Goverment" misspelling.
# 3) Turn a standalone "N/A" into a true missing value (not the string "NA").
# fixed = TRUE: the patterns are literal text, not regular expressions.
SMMH_clean$OrgAffil <- gsub(", N/A", "", SMMH_clean$OrgAffil, fixed = TRUE)
SMMH_clean$OrgAffil <- gsub("Goverment", "Government", SMMH_clean$OrgAffil,
                            fixed = TRUE)
SMMH_clean$OrgAffil[SMMH_clean$OrgAffil %in% "N/A"] <- NA_character_

# useNA = "ifany" keeps the count of missing affiliations visible.
table(SMMH_clean$OrgAffil, useNA = "ifany")
##
## Company Company, Private
## 30 7
## Government NA
## 6 30
## Private School
## 60 46
## School, Company School, Private
## 2 1
## School, University School, University, Private
## 9 4
## University University, Company
## 242 19
## University, Company, Government University, Company, Private
## 1 5
## University, Government University, Government, Private
## 2 1
## University, Private
## 16
Additional freq tables
table(SMMH_clean$UseSocialMedia) # no cleaning needed
##
## No Yes
## 3 478
table(SMMH_clean$Platforms) # may need to clean but would be easier if data was formatted differently
##
## Discord
## 1
## Discord, Reddit
## 3
## Facebook
## 18
## Facebook, Discord
## 1
## Facebook, Discord, Reddit
## 1
## Facebook, Instagram
## 10
## Facebook, Instagram, Discord
## 2
## Facebook, Instagram, Pinterest
## 1
## Facebook, Instagram, Reddit, TikTok
## 1
## Facebook, Instagram, Snapchat
## 3
## Facebook, Instagram, Snapchat, Discord, Pinterest, TikTok
## 1
## Facebook, Instagram, Snapchat, Pinterest, TikTok
## 1
## Facebook, Instagram, TikTok
## 2
## Facebook, Instagram, YouTube
## 35
## Facebook, Instagram, YouTube, Discord
## 18
## Facebook, Instagram, YouTube, Discord, Pinterest
## 7
## Facebook, Instagram, YouTube, Discord, Pinterest, TikTok
## 2
## Facebook, Instagram, YouTube, Discord, Reddit
## 4
## Facebook, Instagram, YouTube, Discord, Reddit, Pinterest
## 2
## Facebook, Instagram, YouTube, Discord, Reddit, TikTok
## 1
## Facebook, Instagram, YouTube, Pinterest
## 16
## Facebook, Instagram, YouTube, Pinterest, TikTok
## 2
## Facebook, Instagram, YouTube, Reddit
## 3
## Facebook, Instagram, YouTube, Reddit, Pinterest
## 1
## Facebook, Instagram, YouTube, Reddit, Pinterest, TikTok
## 1
## Facebook, Instagram, YouTube, Reddit, TikTok
## 1
## Facebook, Instagram, YouTube, Snapchat
## 28
## Facebook, Instagram, YouTube, Snapchat, Discord
## 19
## Facebook, Instagram, YouTube, Snapchat, Discord, Pinterest
## 4
## Facebook, Instagram, YouTube, Snapchat, Discord, Pinterest, TikTok
## 5
## Facebook, Instagram, YouTube, Snapchat, Discord, Reddit
## 4
## Facebook, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest
## 5
## Facebook, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok
## 5
## Facebook, Instagram, YouTube, Snapchat, Discord, Reddit, TikTok
## 2
## Facebook, Instagram, YouTube, Snapchat, Discord, TikTok
## 7
## Facebook, Instagram, YouTube, Snapchat, Pinterest
## 8
## Facebook, Instagram, YouTube, Snapchat, Pinterest, TikTok
## 7
## Facebook, Instagram, YouTube, Snapchat, Reddit, Pinterest, TikTok
## 2
## Facebook, Instagram, YouTube, Snapchat, TikTok
## 3
## Facebook, Instagram, YouTube, TikTok
## 4
## Facebook, Snapchat, Reddit
## 1
## Facebook, TikTok
## 1
## Facebook, Twitter
## 1
## Facebook, Twitter, Instagram, Snapchat, Reddit
## 1
## Facebook, Twitter, Instagram, YouTube
## 14
## Facebook, Twitter, Instagram, YouTube, Discord
## 3
## Facebook, Twitter, Instagram, YouTube, Discord, Pinterest
## 2
## Facebook, Twitter, Instagram, YouTube, Discord, Pinterest, TikTok
## 1
## Facebook, Twitter, Instagram, YouTube, Discord, Reddit
## 4
## Facebook, Twitter, Instagram, YouTube, Discord, Reddit, Pinterest
## 2
## Facebook, Twitter, Instagram, YouTube, Discord, Reddit, Pinterest, TikTok
## 1
## Facebook, Twitter, Instagram, YouTube, Discord, Reddit, TikTok
## 2
## Facebook, Twitter, Instagram, YouTube, Pinterest
## 7
## Facebook, Twitter, Instagram, YouTube, Reddit
## 3
## Facebook, Twitter, Instagram, YouTube, Reddit, Pinterest
## 3
## Facebook, Twitter, Instagram, YouTube, Reddit, Pinterest, TikTok
## 1
## Facebook, Twitter, Instagram, YouTube, Snapchat
## 5
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord
## 4
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Pinterest
## 5
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Pinterest, TikTok
## 4
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit
## 8
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest
## 6
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok
## 11
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, TikTok
## 1
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, TikTok
## 5
## Facebook, Twitter, Instagram, YouTube, Snapchat, Pinterest
## 6
## Facebook, Twitter, Instagram, YouTube, Snapchat, Reddit, Pinterest
## 1
## Facebook, Twitter, Instagram, YouTube, Snapchat, Reddit, TikTok
## 1
## Facebook, Twitter, Instagram, YouTube, Snapchat, TikTok
## 3
## Facebook, Twitter, YouTube
## 1
## Facebook, Twitter, YouTube, Discord
## 1
## Facebook, Twitter, YouTube, Discord, Pinterest
## 1
## Facebook, Twitter, YouTube, Discord, Reddit
## 1
## Facebook, Twitter, YouTube, Pinterest
## 1
## Facebook, YouTube
## 30
## Facebook, YouTube, Discord
## 6
## Facebook, YouTube, Discord, Pinterest
## 2
## Facebook, YouTube, Discord, Reddit
## 5
## Facebook, YouTube, Discord, Reddit, Pinterest
## 1
## Facebook, YouTube, Pinterest
## 5
## Facebook, YouTube, Reddit
## 2
## Facebook, YouTube, Snapchat
## 2
## Facebook, YouTube, Snapchat, Discord
## 1
## Facebook, YouTube, Snapchat, Pinterest
## 1
## Facebook, YouTube, TikTok
## 3
## Instagram
## 5
## Instagram, Discord
## 1
## Instagram, Discord, Reddit, Pinterest, TikTok
## 1
## Instagram, Reddit
## 1
## Instagram, YouTube
## 3
## Instagram, YouTube, Discord
## 2
## Instagram, YouTube, Discord, Pinterest
## 1
## Instagram, YouTube, Discord, Reddit
## 2
## Instagram, YouTube, Discord, Reddit, Pinterest
## 1
## Instagram, YouTube, Discord, Reddit, TikTok
## 1
## Instagram, YouTube, Pinterest
## 1
## Instagram, YouTube, Reddit, Pinterest
## 1
## Instagram, YouTube, Snapchat, Discord, Reddit
## 2
## Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok
## 1
## Instagram, YouTube, Snapchat, Pinterest
## 1
## Pinterest
## 2
## Reddit
## 4
## Reddit, Pinterest
## 1
## TikTok
## 1
## Twitter
## 1
## Twitter, Discord, Reddit
## 2
## Twitter, Instagram, TikTok
## 1
## Twitter, Instagram, YouTube
## 3
## Twitter, Instagram, YouTube, Discord, Pinterest
## 1
## Twitter, Instagram, YouTube, Reddit, TikTok
## 3
## Twitter, Instagram, YouTube, Snapchat, Discord, Reddit
## 1
## Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok
## 1
## Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, TikTok
## 2
## Twitter, Instagram, YouTube, Snapchat, Reddit, Pinterest, TikTok
## 1
## Twitter, Instagram, YouTube, TikTok
## 1
## Twitter, YouTube
## 2
## Twitter, YouTube, Discord, Reddit
## 1
## Twitter, YouTube, Reddit
## 1
## YouTube
## 6
## YouTube, Discord
## 1
## YouTube, Discord, Reddit
## 4
## YouTube, Pinterest
## 1
## YouTube, Reddit
## 3
## YouTube, Snapchat, Discord
## 1
## YouTube, Snapchat, Discord, Reddit
## 1
table(SMMH_clean$AmtTime) # no cleaning needed
##
## Between 1 and 2 hours Between 2 and 3 hours Between 3 and 4 hours
## 70 101 93
## Between 4 and 5 hours Less than an Hour More than 5 hours
## 67 34 116
Save cleaned file to new folder
folder <- "DataCleanAndFormatted"
# Create the output folder on first run; the CSV is written either way,
# so the write call lives outside the if instead of being duplicated.
if (!dir.exists(folder)) {
  dir.create(folder)
}
# row.names = FALSE: do not add an index column to the file.
write.csv(SMMH_clean, file.path(folder, "SMMH_clean.csv"), row.names = FALSE)
Nominal variables - create new columns that are factors
# For each nominal variable add a "<name>_label" factor copy, leaving the
# original character column in place.
nominal_vars <- c("Gender", "RelStatus", "OccStatus", "OrgAffil",
                  "UseSocialMedia")
SMMH_clean[paste0(nominal_vars, "_label")] <- lapply(
  SMMH_clean[nominal_vars],
  as.factor
)
str(SMMH_clean)
## 'data.frame': 481 obs. of 26 variables:
## $ Timestamp : chr "4/18/2022 19:18:47" "4/18/2022 19:19:28" "4/18/2022 19:25:59" "4/18/2022 19:29:43" ...
## $ Age : num 21 21 21 21 21 22 21 21 21 20 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ RelStatus : chr "In a relationship" "Single" "Single" "Single" ...
## $ OccStatus : chr "University Student" "University Student" "University Student" "University Student" ...
## $ OrgAffil : chr "University" "University" "University" "University" ...
## $ UseSocialMedia : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Platforms : chr "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Instagram, YouTube, Pinterest" "Facebook, Instagram" ...
## $ AmtTime : chr "Between 2 and 3 hours" "More than 5 hours" "Between 3 and 4 hours" "More than 5 hours" ...
## $ WoutPurpose : int 5 4 3 4 3 4 4 5 5 1 ...
## $ Distracted : int 3 3 2 2 5 4 3 2 2 1 ...
## $ Restless : int 2 2 1 1 4 2 2 3 3 1 ...
## $ EasilyDist : int 5 4 2 3 4 3 2 3 3 1 ...
## $ Bothered : int 2 5 5 5 5 4 4 3 1 1 ...
## $ DiffConcen : int 5 4 4 3 5 3 3 1 1 1 ...
## $ Compare : int 2 5 3 5 3 4 5 1 1 1 ...
## $ Comparisons : int 3 1 3 1 3 4 3 3 3 1 ...
## $ Validation : int 2 1 1 2 3 3 4 1 1 1 ...
## $ Depressed : int 5 5 4 4 4 3 5 5 5 1 ...
## $ Interest : int 4 4 2 3 4 2 5 5 5 1 ...
## $ Sleep : int 5 5 5 2 1 4 3 1 1 1 ...
## $ Gender_label : Factor w/ 4 levels "Female","Male",..: 2 1 1 1 1 1 1 1 1 2 ...
## $ RelStatus_label : Factor w/ 4 levels "Divorced","In a relationship",..: 2 4 4 4 4 4 3 2 2 4 ...
## $ OccStatus_label : Factor w/ 4 levels "Retired","Salaried Worker",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ OrgAffil_label : Factor w/ 16 levels "Company","Company, Private",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ UseSocialMedia_label: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
Ordinal ‘amount of time’ variable
# map for replacement: 1 the lowest # hours / 6 the highest # hours
# Rank of each time bracket: 1 = fewest hours per day, 6 = most.
map_AmtTime <- c(
  "Less than an Hour" = 1,
  "Between 1 and 2 hours" = 2,
  "Between 2 and 3 hours" = 3,
  "Between 3 and 4 hours" = 4,
  "Between 4 and 5 hours" = 5,
  "More than 5 hours" = 6
)
# Look up the rank for every response; unname() stops the response text from
# being carried along as element names on the new column.
SMMH_clean$AmtTime_int <- unname(map_AmtTime[SMMH_clean$AmtTime])
# Ordered factor whose labels are "<rank>_<bracket text>", built from the
# lookup so ranks and labels cannot drift apart.
SMMH_clean$AmtTime_label <- factor(
  SMMH_clean$AmtTime_int,
  levels = seq_along(map_AmtTime),
  labels = paste(map_AmtTime, names(map_AmtTime), sep = "_"),
  ordered = TRUE
)
Ordinal likert scale items
# Scale points and display labels shared by all 1-5 survey items.
# NOTE(review): several of these items are "how often" questions; confirm
# that agree/disagree anchors are the intended labels.
theInts <- seq(1, 5)
theLabels <- c("1_Strongly Disagree", "2_Disagree", "3_Neutral", "4_Agree",
               "5_Strongly Agree")

# Convert one 1-5 response column into an ordered factor using the labels
# above; values outside 1-5 become NA.
FormatOrdinal <- function(col) {
  factor(col, levels = theInts, labels = theLabels, ordered = TRUE)
}

# Select the scale items by name rather than by position (and avoid a
# variable called `names`, which would shadow base::names()).
likert_cols <- c("WoutPurpose", "Distracted", "Restless", "EasilyDist",
                 "Bothered", "DiffConcen", "Compare", "Comparisons",
                 "Validation", "Depressed", "Interest", "Sleep")
SMMH_clean[likert_cols] <- lapply(SMMH_clean[likert_cols], FormatOrdinal)
str(SMMH_clean)
## 'data.frame': 481 obs. of 28 variables:
## $ Timestamp : chr "4/18/2022 19:18:47" "4/18/2022 19:19:28" "4/18/2022 19:25:59" "4/18/2022 19:29:43" ...
## $ Age : num 21 21 21 21 21 22 21 21 21 20 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ RelStatus : chr "In a relationship" "Single" "Single" "Single" ...
## $ OccStatus : chr "University Student" "University Student" "University Student" "University Student" ...
## $ OrgAffil : chr "University" "University" "University" "University" ...
## $ UseSocialMedia : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Platforms : chr "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Instagram, YouTube, Pinterest" "Facebook, Instagram" ...
## $ AmtTime : chr "Between 2 and 3 hours" "More than 5 hours" "Between 3 and 4 hours" "More than 5 hours" ...
## $ WoutPurpose : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 4 3 4 3 4 4 5 5 1 ...
## $ Distracted : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 3 3 2 2 5 4 3 2 2 1 ...
## $ Restless : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 2 1 1 4 2 2 3 3 1 ...
## $ EasilyDist : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 4 2 3 4 3 2 3 3 1 ...
## $ Bothered : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 5 5 5 5 4 4 3 1 1 ...
## $ DiffConcen : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 4 4 3 5 3 3 1 1 1 ...
## $ Compare : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 5 3 5 3 4 5 1 1 1 ...
## $ Comparisons : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 3 1 3 1 3 4 3 3 3 1 ...
## $ Validation : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 1 1 2 3 3 4 1 1 1 ...
## $ Depressed : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 5 4 4 4 3 5 5 5 1 ...
## $ Interest : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 4 4 2 3 4 2 5 5 5 1 ...
## $ Sleep : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 5 5 2 1 4 3 1 1 1 ...
## $ Gender_label : Factor w/ 4 levels "Female","Male",..: 2 1 1 1 1 1 1 1 1 2 ...
## $ RelStatus_label : Factor w/ 4 levels "Divorced","In a relationship",..: 2 4 4 4 4 4 3 2 2 4 ...
## $ OccStatus_label : Factor w/ 4 levels "Retired","Salaried Worker",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ OrgAffil_label : Factor w/ 16 levels "Company","Company, Private",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ UseSocialMedia_label: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
## $ AmtTime_int : num 3 6 4 6 3 3 4 6 6 1 ...
## $ AmtTime_label : Ord.factor w/ 6 levels "1_Less than an Hour"<..: 3 6 4 6 3 3 4 6 6 1 ...
Save as RDS
folder <- "DataCleanAndFormatted"
# Make sure the output folder is there, then serialise the formatted data
# (factor columns included) to RDS.
if (!dir.exists(folder)) {
  dir.create(folder)
}
saveRDS(SMMH_clean, file.path(folder, "SMMH_formatted.RDS"))
SECOND DATASET - CLEANING
Clerkship grades; need to skip the first line in the Excel file
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
# Fetch the workbook from GitHub into the working directory, then read it.
url <- "https://github.com/DACSS-690R/First_Deliverable/raw/refs/heads/main/dataFiles/UMassChan_ClerkshipGrades.xlsx"
destfile <- "ClerkshipGrades.xlsx"
# mode = "wb": an .xlsx file is binary and must not be written in text mode.
download.file(url, destfile = destfile, mode = "wb")
# skip = 1: the first row of the sheet is not the header row.
ClerkshipGrades_dirty <- read_xlsx(destfile, skip = 1)
## New names:
## • `Session` -> `Session...6`
## • `Session` -> `Session...7`
ClerkshipGrades_dirty
Remove leading and trailing spaces
# Strip leading/trailing whitespace from every column. trimws() returns
# character, so every column is a character vector after this step.
# lapply() keeps the result a list of columns regardless of row count
# (sapply() would collapse it to a matrix or a bare vector).
ClerkshipGrades_dirty[] <- lapply(ClerkshipGrades_dirty, trimws)
ClerkshipGrades_clean <- ClerkshipGrades_dirty
View(ClerkshipGrades_clean)
str(ClerkshipGrades_clean)
## tibble [6,427 × 17] (S3: tbl_df/tbl/data.frame)
## $ Term : chr [1:6427] "5300" "5301" "5301" "5301" ...
## $ ID : chr [1:6427] "1793" "1793" "1793" "1793" ...
## $ Subject : chr [1:6427] "OB" "PS" "FC" "PE" ...
## $ Catalog : chr [1:6427] "300" "300" "300" "300" ...
## $ Title : chr [1:6427] "Clerkship in Ob/Gyn" "Clerkship in Psychiatry" "Clerkship in Family Medicine" "Clerkship in Pediatrics" ...
## $ Session...6 : chr [1:6427] "Core Clinical Exp Section 1C" "Core Clinical Exp Section 3A" "Core Clinical Exp Section 3B" "Core Clinical Exp Section 3C" ...
## $ Session...7 : chr [1:6427] "S1C" "S3A" "S3B" "S3C" ...
## $ Location : chr [1:6427] "CCH" "TARAVISTA" "WSO" "UMM" ...
## $ Location_Description: chr [1:6427] "Cape Cod Hospital" "TaraVista Behavioral Health Center" "Worcester Area South" "UMMHC-University Campus" ...
## $ Final_Letter : chr [1:6427] "H" "HH" "H" "HH" ...
## $ Final_Numeric : chr [1:6427] "83" "91" "92.3" "3.7" ...
## $ SPE_Letter : chr [1:6427] "HONORS" "High Honors" "High Honors" "High Honors" ...
## $ SPE_Numeric : chr [1:6427] "37" "96" "43.75" "3.55" ...
## $ NBME-Written_Letter : chr [1:6427] "Honors" "Pass" "Honors" "Honors" ...
## $ NBME-Written_Numeric: chr [1:6427] "11" "78" "18" "87" ...
## $ OSCE_Letter : chr [1:6427] "Honors" "High Honors" "Honors" "High Honors" ...
## $ OSCE_Numeric : chr [1:6427] "20" "93" "16.45" "93" ...
Remove dashes from two variable names
# Give the two NBME columns syntactic names (no dash) so they can be used
# with `$` without backticks.
ClerkshipGrades_clean <- ClerkshipGrades_clean %>%
  rename(
    NBME_Letter = "NBME-Written_Letter",
    NBME_Numeric = "NBME-Written_Numeric"
  )
Run frequency tables of categorical variables to see if any recoding
is needed
table(ClerkshipGrades_clean$Subject) # no cleaning needed
##
## FC ME NU OB PE PS SU
## 942 790 934 960 938 938 925
table(ClerkshipGrades_clean$Catalog) # no cleaning needed
##
## 300 300B 302
## 4703 790 934
# read_xlsx renamed the two duplicate "Session" headers to `Session...6` and
# `Session...7`, so a plain `Session` column does not exist. Tabulate the
# short session code (`Session...7`) to check whether recoding is needed.
table(ClerkshipGrades_clean$`Session...7`)
## Warning: Unknown or uninitialised column: `Session`.
## < table of extent 0 >
table(ClerkshipGrades_clean$Location) # no cleaning needed
##
## BAY BAYP BRK BRKA BST CCH CMH DR.JCMHC
## 269 809 373 3 14 495 2 51
## EAST EMHS.MEM FAL HBM HMC LAHE MAR MDL
## 101 1 43 19 16 51 88 17
## MEM MIL PIO POCMH PTC SEA ST.LH STV
## 632 410 10 8 28 28 4 506
## TARAVISTA TASH UMCS UMM UMME UMMHC.ACS UMMHC.CR UMMHC.HA
## 113 22 115 1020 98 44 72 42
## UMMHC.MIS UMMHC.SO UMMHCCL.A UMMHCCL.P WNO WRCA WRCHAU WSO
## 42 42 2 3 304 52 29 449
table(ClerkshipGrades_clean$Final_Letter)
##
## F H HH I P S
## 3 3782 2221 5 305 111
# Spell out the abbreviated final grades in a new column; the original
# Final_Letter column is kept as-is.
final_grade_names <- c(
  HH = "High Honors",
  H = "Honors",
  P = "Pass",
  S = "Satisfactory",
  F = "Fail",
  I = "Incomplete"
)
ClerkshipGrades_clean$Final_Letter2 <- recode(
  ClerkshipGrades_clean$Final_Letter,
  !!!final_grade_names
)
table(ClerkshipGrades_clean$Final_Letter2)
##
## Fail High Honors Honors Incomplete Pass Satisfactory
## 3 2221 3782 5 305 111
table(ClerkshipGrades_clean$SPE_Letter)
##
## High Honors Honors HONORS Incomplete Pass
## 3853 2060 4 1 133
# Normalise the all-caps "HONORS" spelling into a new column; every other
# value is carried over unchanged.
ClerkshipGrades_clean$SPE_Letter2 <- recode(
  ClerkshipGrades_clean$SPE_Letter,
  "HONORS" = "Honors"
)
table(ClerkshipGrades_clean$SPE_Letter2)
##
## High Honors Honors Incomplete Pass
## 3853 2064 1 133
table(ClerkshipGrades_clean$NBME_Letter)
##
## Fail High Honors HIGH HONORS Honors HONORS Incomplete
## 3 1183 11 2437 15 5
## Pass PASS Satisfactory
## 1805 17 37
# Normalise the all-caps grade spellings into a new column; every other
# value is carried over unchanged.
ClerkshipGrades_clean$NBME_Letter2 <- recode(
  ClerkshipGrades_clean$NBME_Letter,
  "HIGH HONORS" = "High Honors",
  "HONORS" = "Honors",
  "PASS" = "Pass"
)
table(ClerkshipGrades_clean$NBME_Letter2)
##
## Fail High Honors Honors Incomplete Pass Satisfactory
## 3 1194 2452 5 1822 37
table(ClerkshipGrades_clean$OSCE_Letter)
##
## Fail High Honors HIGH HONORS Honors HONORS Incomplete
## 3 2199 1 2964 8 1
## Pass PASS Satisfactory
## 1187 9 54
# Normalise the all-caps grade spellings into a new column; every other
# value is carried over unchanged.
ClerkshipGrades_clean$OSCE_Letter2 <- recode(
  ClerkshipGrades_clean$OSCE_Letter,
  "HIGH HONORS" = "High Honors",
  "HONORS" = "Honors",
  "PASS" = "Pass"
)
table(ClerkshipGrades_clean$OSCE_Letter2)
##
## Fail High Honors Honors Incomplete Pass Satisfactory
## 3 2200 2972 1 1196 54
Cleaning numeric variables
# Rows with at least one missing value, and the first row for reference.
ClerkshipGrades_clean[!complete.cases(ClerkshipGrades_clean), ]
ClerkshipGrades_clean[1, ]

# For each numeric grade column, count the entries that are missing or cannot
# be parsed as a number. The columns are selected by name: positions
# 10/12/14/16 are the *_Letter columns, for which every value fails to parse.
numeric_cols <- c("Final_Numeric", "SPE_Numeric", "NBME_Numeric",
                  "OSCE_Numeric")
vapply(
  ClerkshipGrades_clean[numeric_cols],
  # The coercion warning is expected here: failed parses are what we count.
  function(col) sum(is.na(suppressWarnings(as.numeric(col)))),
  integer(1)
)
## Warning in apply(ClerkshipGrades_clean[, c(10, 12, 14, 16)], 2, as.numeric):
## NAs introduced by coercion
## Warning in apply(ClerkshipGrades_clean[, c(10, 12, 14, 16)], 2, as.numeric):
## NAs introduced by coercion
## Warning in apply(ClerkshipGrades_clean[, c(10, 12, 14, 16)], 2, as.numeric):
## NAs introduced by coercion
## Warning in apply(ClerkshipGrades_clean[, c(10, 12, 14, 16)], 2, as.numeric):
## NAs introduced by coercion
## Final_Letter SPE_Letter NBME_Letter OSCE_Letter
## 6427 6427 6427 6427
# Return the entries of `col` that are present but are not plain decimal
# numbers (optional sign, digits, at most one decimal point).
#
# col: character vector of candidate numeric values.
# Returns a character vector of the offending entries (possibly empty);
# NA entries are never returned.
#
# The pattern is anchored to the whole string, so malformed values such as
# "1.2.3" or a lone "+" are flagged, as well as values containing stray
# characters such as "4 (7" or "11,4".
detectWrongNA <- function(col) {
  is_plain_number <- grepl("^[+-]?(\\d+\\.?\\d*|\\.\\d+)$", col, perl = TRUE)
  col[!is.na(col) & !is_plain_number]
}
# Collect, per numeric grade column, the values that are not valid numbers.
# lapply() always yields a list; sapply() would return a matrix whenever every
# column has the same number of bad values, and unique() would then work on
# rows instead of individual values.
badSymbolNum <- lapply(
  ClerkshipGrades_clean[, c("Final_Numeric", "SPE_Numeric", "NBME_Numeric",
                            "OSCE_Numeric")],
  detectWrongNA
)
badSymbolNum_unlist <- unlist(badSymbolNum)
# Distinct bad values across all four columns.
badSymbolNum_vector <- unique(badSymbolNum_unlist)
badSymbolNum_vector
## [1] "4 (7" "4 (9" "4 (8" "11,4" "12,7"
# Blank out the malformed values found above and store the grade columns as
# numbers, so numeric summaries work on them (they were read as text).
numeric_cols <- c("Final_Numeric", "SPE_Numeric", "NBME_Numeric",
                  "OSCE_Numeric")
ClerkshipGrades_clean[numeric_cols] <- lapply(
  ClerkshipGrades_clean[numeric_cols],
  function(col) {
    col[col %in% badSymbolNum_vector] <- NA
    as.numeric(col)
  }
)
ClerkshipGrades_clean
Save cleaned file to new folder
folder <- "DataCleanAndFormatted"
# Create the output folder on first run. The write call sits outside the if
# so the same data frame is written whether or not the folder already existed
# (the first-run branch used to write SMMH_clean into this file).
if (!dir.exists(folder)) {
  dir.create(folder)
}
# Row names are written (write.csv default), so the file gains a leading
# index column when it is read back.
write.csv(ClerkshipGrades_clean, file.path(folder, "ClerkshipGrades_clean.csv"))
Change all Letter Grade columns to uppercase, and convert to ordered
factors
# Recoded letter-grade columns, selected by name: positional indices depend
# on how many columns precede them and silently hit the wrong columns when
# the layout changes.
Likert_cols <- c("Final_Letter2", "SPE_Letter2", "NBME_Letter2",
                 "OSCE_Letter2")
# Grade scale from lowest to highest.
grade_levels <- c("FAIL", "INCOMPLETE", "SATISFACTORY", "PASS", "HONORS",
                  "HIGH HONORS")
# Upper-case each grade, then convert to an ordered factor on that scale;
# values not in grade_levels become NA.
ClerkshipGrades_clean[Likert_cols] <- lapply(
  ClerkshipGrades_clean[Likert_cols],
  function(col) factor(toupper(col), levels = grade_levels, ordered = TRUE)
)
str(ClerkshipGrades_clean)
## 'data.frame': 6427 obs. of 19 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Term : int 5300 5301 5301 5301 5300 5300 5300 5000 5001 5001 ...
## $ Student_ID : int 1793 1793 1793 1793 1793 1793 1793 1379 1379 1379 ...
## $ Subject : chr "OB" "PS" "FC" "PE" ...
## $ Catalog : chr "300" "300" "300" "300" ...
## $ Session : chr "S1C" "S3A" "S3B" "S3C" ...
## $ Location : chr "Cape Cod Hospital" "TaraVista Behavioral Health Center" "Worcester Area South" "UMMHC-University Campus" ...
## $ Final_Letter : chr "H" "HH" "H" "HH" ...
## $ Final_Numeric: num 83 91 92.3 3.7 3 3.3 95.8 NA NA NA ...
## $ SPE_Letter : chr "HONORS" "High Honors" "High Honors" "High Honors" ...
## $ SPE_Numeric : num 37 96 43.75 3.55 3.6 ...
## $ NBME_Letter : chr "Honors" "Pass" "Honors" "Honors" ...
## $ NBME_Numeric : num 11 78 18 87 67 2.1 75 11 82 12.1 ...
## $ OSCE_Letter : chr "Honors" "High Honors" "Honors" "High Honors" ...
## $ OSCE_Numeric : num 20 93 16.4 93 62 ...
## $ Final_Letter2: Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 6 5 6 5 5 6 5 5 5 ...
## $ SPE_Letter2 : Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 6 6 6 6 6 6 5 NA 6 ...
## $ NBME_Letter2 : Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 4 5 5 4 4 5 5 5 NA ...
## $ OSCE_Letter2 : Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 6 5 6 4 5 6 5 5 6 ...
Save as RDS
folder <- "DataCleanAndFormatted"
# Create the output folder on first run. A single saveRDS call guarantees the
# same file name in both cases (the first-run branch used a string literal
# that contained a line break before the file name).
if (!dir.exists(folder)) {
  dir.create(folder)
}
saveRDS(ClerkshipGrades_clean,
        file.path(folder, "ClerkshipGrades_formatted.RDS"))
THIRD DATASET - subset of crime data
#install.packages("jsonlite")
#library(jsonlite)
#endPoint="https://data.lacity.org/resource/2nrs-mtv8.json"
#LA_Crime_data = fromJSON(endPoint)
#View(LA_Crime_data)
#write.csv(LA_Crime_data, "../dataFiles/Crime_Data_subset.csv")
Social Media and Mental Health